Import¶
In [9]:
import time
import torch
import random
import numpy as np
import pandas as pd
from fsspec.compression import unzip
from matplotlib import pyplot as plt, ticker
from sklearn.cluster import KMeans, DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score, accuracy_score, \
precision_score, silhouette_score, silhouette_samples
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import seaborn as sns
from sklearn.svm import OneClassSVM
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import matplotlib.cm as cm
import torch.nn.functional as F
# Set seeds
seed = 69
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
Task 1: Dataset Characterization and Preprocessing¶
What are your dataset characteristics? How are your attack labels and binary_label distributed?¶
In [10]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
In [11]:
train_data["protocol_type"].value_counts().plot(kind="bar", figsize=(10, 4))
plt.title("Number of Samples per Protocol")
plt.xlabel("Protocol")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
plt.close()
train_data["service"].value_counts().plot(kind="bar", figsize=(10, 4))
plt.title("Number of Samples per service")
plt.xlabel("Service")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
plt.close()
train_data["flag"].value_counts().plot(kind="bar", figsize=(10, 4))
plt.title("Number of Samples per flag")
plt.xlabel("flag")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
plt.close()
How are your attack labels and binary_label distributed?¶
In [12]:
train_data["label"].value_counts().plot(kind="bar", figsize=(10, 4))
plt.title("Number of Samples per Label")
plt.xlabel("Label")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
plt.close()
train_data["binary_label"].value_counts().plot(kind="bar", figsize=(10, 4))
plt.title("Number of Samples per Binary Label")
plt.xlabel("Label")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()
plt.close()
normal_samples = train_data[train_data["binary_label"] == 0].shape[0]
anomalies_samples = train_data[train_data["binary_label"] == 1].shape[0]
print(f"Percentage of normal samples: {(normal_samples / train_data.shape[0]) * 100:.2f} \nPercentage of anomalies samples: {(anomalies_samples / train_data.shape[0]) * 100:.2f}")
Percentage of normal samples: 71.41 Percentage of anomalies samples: 28.59
How many categorical and numerical attributes do you have?¶
In [13]:
# Get numerical and categorical attributes
numerical_attributes = train_data.select_dtypes(include=[np.number])
categorical_attributes = train_data.select_dtypes(exclude=[np.number])
print(f"There are {numerical_attributes.shape[1]} numerical attributes and {categorical_attributes.shape[1]} categorical attributes")
There are 39 numerical attributes and 4 categorical attributes
In [14]:
train_data.nunique()
Out[14]:
duration 750 protocol_type 3 service 65 flag 11 src_bytes 1657 dst_bytes 3901 land 2 wrong_fragment 3 urgent 1 hot 21 num_failed_logins 5 logged_in 2 num_compromised 27 root_shell 2 su_attempted 3 num_root 25 num_file_creations 19 num_shells 2 num_access_files 7 num_outbound_cmds 1 is_host_login 1 is_guest_login 2 count 466 srv_count 414 serror_rate 65 srv_serror_rate 43 rerror_rate 72 srv_rerror_rate 42 same_srv_rate 94 diff_srv_rate 79 srv_diff_host_rate 57 dst_host_count 256 dst_host_srv_count 256 dst_host_same_srv_rate 101 dst_host_diff_srv_rate 101 dst_host_same_src_port_rate 101 dst_host_srv_diff_host_rate 63 dst_host_serror_rate 100 dst_host_srv_serror_rate 76 dst_host_rerror_rate 101 dst_host_srv_rerror_rate 100 label 4 binary_label 2 dtype: int64
Preprocessing¶
Drop useless features¶
In [15]:
# We drop these features because: urgent, num_outbound_cmds, is_host_login are always 0; land is 1 only for one entry. So these features do not give any information to the model
features_to_drop = ["urgent", "num_outbound_cmds", "is_host_login", "land"]
train_data = train_data.drop(columns=features_to_drop)
test_data = test_data.drop(columns=features_to_drop)
In [16]:
# Calculate correlation matrix for numerical features
correlation_matrix = train_data.select_dtypes(include=[np.number]).corr()
# Create heatmap
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()
plt.close()
In [929]:
# Deleting the ones with |p|>=0.8. We try to limit curse of dimensionality
#train_data = train_data.drop(['hot', 'serror_rate', 'dst_host_serror_rate','dst_host_srv_serror_rate', 'rerror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'dst_host_same_srv_rate'], axis=1)
#test_data = test_data.drop(['hot', 'serror_rate', 'dst_host_serror_rate','dst_host_srv_serror_rate', 'rerror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'dst_host_same_srv_rate'], axis=1)
In [17]:
# Removing duplicates
previous_train_length = len(train_data)
previous_test_length = len(test_data)
test_data = test_data.drop_duplicates()
train_data = train_data.drop_duplicates()
print(f"Removed {previous_train_length - len(train_data)} duplicates from training set")
print(f"Removed {previous_test_length - len(test_data)} duplicates from test set")
# Remove missing values
previous_train_length = len(train_data)
previous_test_length = len(test_data)
test_data = test_data.dropna()
train_data = train_data.dropna()
print(f"Removed {previous_train_length - len(train_data)} missing values from training set")
print(f"Removed {previous_test_length - len(test_data)} missing values from test set")
# Remove infinite and negative values
numerical_attributes_train = train_data.select_dtypes(include=[np.number])
numerical_attributes_test = test_data.select_dtypes(include=[np.number])
previous_train_length = len(numerical_attributes_train)
previous_test_length = len(numerical_attributes_test)
numerical_attributes_train = train_data[(numerical_attributes.iloc[:, :-2] >= 0).all(axis=1) & np.isfinite(numerical_attributes.iloc[:, :-1]).all(axis=1)]
numerical_attributes_test = test_data[(numerical_attributes_test.iloc[:, :-2] >= 0).all(axis=1) & np.isfinite(numerical_attributes_test.iloc[:, :-1]).all(axis=1)]
actual_length = len(numerical_attributes)
print(f"Removed {previous_train_length - len(numerical_attributes_train)} infinite and negative values from training set")
print(f"Removed {previous_test_length - len(numerical_attributes_test)} infinite and negative values from test set")
Removed 0 duplicates from training set Removed 0 duplicates from test set Removed 0 missing values from training set Removed 0 missing values from test set Removed 0 infinite and negative values from training set Removed 0 infinite and negative values from test set
Reduce the number of possible categories for "flag" and "service" categorical attributes¶
In [18]:
service_percentage = train_data['service'].value_counts(normalize=True) * 100 # Get percentage of service values
flag_percentage = train_data['flag'].value_counts(normalize=True) * 100 # Get percentage of flag values
services_over_1pct = service_percentage[service_percentage > 1] # Get services with percentage > 1
flags_over_1pct = flag_percentage[flag_percentage > 1] # Get flags with percentage > 1
print(services_over_1pct)
print(flags_over_1pct)
allowed_services = ['http', 'private', 'domain_u', 'smtp', 'ftp_data', 'echo_i', 'ecr_it', 'telnet', 'ftp', 'finger']
allowed_flags = ['SF', 'S0', 'REJ', 'RSTR']
train_data["service"] = train_data["service"].apply(lambda x: x if x in allowed_services else "other")
test_data["service"] = test_data["service"].apply(lambda x: x if x in allowed_services else "other")
train_data["flag"] = train_data["flag"].apply(lambda x: x if x in allowed_flags else "other")
test_data["flag"] = test_data["flag"].apply(lambda x: x if x in allowed_flags else "other")
service http 41.585683 private 10.811959 domain_u 9.664914 smtp 7.492964 ftp_data 6.324677 eco_i 4.827147 other 4.540386 ecr_i 3.064096 telnet 1.423185 ftp 1.396633 finger 1.024906 Name: proportion, dtype: float64 flag SF 79.162020 S0 9.372843 REJ 7.216823 RSTR 2.639265 Name: proportion, dtype: float64
In [19]:
service_percentage_train = train_data['service'].value_counts(normalize=True) * 100 # Get percentage of service values
service_percentage_test = test_data['service'].value_counts(normalize=True) * 100 # Get percentage of service values
flag_percentage_train = train_data['flag'].value_counts(normalize=True) * 100 # Get percentage of flag values
flag_percentage_test = test_data['flag'].value_counts(normalize=True) * 100 # Get percentage of flag values
print("TRAINING SET")
print(service_percentage_train)
print(flag_percentage_train)
print("\nTEST SET")
print(service_percentage_test)
print(flag_percentage_test)
TRAINING SET service http 41.585683 other 20.275078 private 10.811959 domain_u 9.664914 smtp 7.492964 ftp_data 6.324677 telnet 1.423185 ftp 1.396633 finger 1.024906 Name: proportion, dtype: float64 flag SF 79.162020 S0 9.372843 REJ 7.216823 RSTR 2.639265 other 1.609049 Name: proportion, dtype: float64 TEST SET service other 44.181256 private 23.498112 http 11.328527 domain_u 9.423275 ftp_data 4.273944 telnet 2.797803 smtp 2.128390 finger 1.390319 ftp 0.978373 Name: proportion, dtype: float64 flag SF 59.079986 REJ 21.198078 S0 10.298661 other 6.333677 RSTR 3.089598 Name: proportion, dtype: float64
Split datasets¶
In [20]:
y = train_data.iloc[:, -1].values # Get label column
X_train, X_val, y_train, y_val = train_test_split(train_data, y, train_size=0.8, stratify=train_data["label"],random_state=seed)
X_test = test_data
y_test = test_data.iloc[:, -1].values # Get label column
Standardization and encoding¶
In [21]:
#Perform feature normalization and process categorical and numerical features simultaneously
# Set up column groups
ignore_cols = ['label', 'binary_label'] # columns to skip entirely
categorical_cols = ['protocol_type', 'service', 'flag']
numerical_cols = [col for col in train_data.columns if col not in categorical_cols + ignore_cols]
# Define ColumnTransformer
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_cols),
('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
],
remainder='drop' # drop columns not listed (i.e., ignore_cols)
)
# Fit on training, transform all splits
X_train_transformed = preprocessor.fit_transform(pd.DataFrame(X_train))
X_val_transformed = preprocessor.transform(pd.DataFrame(X_val))
X_test_transformed = preprocessor.transform(pd.DataFrame(X_test))
# Get transformed feature names and make DataFrames
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = list(numerical_cols) + list(cat_feature_names)
df_train_preprocessed = pd.DataFrame(X_train_transformed, columns=all_feature_names)
df_val_preprocessed = pd.DataFrame(X_val_transformed, columns=all_feature_names)
df_test_preprocessed = pd.DataFrame(X_test_transformed, columns=all_feature_names)
# Restore the labels
df_train_preprocessed["label"] = list(X_train["label"])
df_val_preprocessed["label"] = list(X_val["label"])
df_test_preprocessed["label"] = list(X_test["label"])
df_train_preprocessed["binary_label"] = list(X_train["binary_label"])
df_val_preprocessed["binary_label"] = list(X_val["binary_label"])
df_test_preprocessed["binary_label"] = list(X_test["binary_label"])
Study your data from a domain expert perspective¶
Mean heatmap¶
In [22]:
feature_mean_per_label = pd.concat([df_train_preprocessed.select_dtypes(include=[np.number]), df_train_preprocessed['label']],axis=1).groupby(['label']).mean()
feature_mean_per_label.style.format("{:.3f}").background_gradient(cmap='Blues')
Out[22]:
| Â | duration | src_bytes | dst_bytes | wrong_fragment | hot | num_failed_logins | logged_in | num_compromised | root_shell | su_attempted | num_root | num_file_creations | num_shells | num_access_files | is_guest_login | count | srv_count | serror_rate | srv_serror_rate | rerror_rate | srv_rerror_rate | same_srv_rate | diff_srv_rate | srv_diff_host_rate | dst_host_count | dst_host_srv_count | dst_host_same_srv_rate | dst_host_diff_srv_rate | dst_host_same_src_port_rate | dst_host_srv_diff_host_rate | dst_host_serror_rate | dst_host_srv_serror_rate | dst_host_rerror_rate | dst_host_srv_rerror_rate | protocol_type_icmp | protocol_type_tcp | protocol_type_udp | service_domain_u | service_finger | service_ftp | service_ftp_data | service_http | service_other | service_private | service_smtp | service_telnet | flag_REJ | flag_RSTR | flag_S0 | flag_SF | flag_other | binary_label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| label | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â |
| dos | -0.133 | -0.041 | -0.051 | 0.560 | -0.055 | -0.023 | -0.921 | -0.020 | -0.042 | -0.031 | -0.026 | -0.031 | -0.020 | -0.049 | -0.112 | 1.254 | 0.553 | 1.584 | 1.581 | 0.064 | 0.067 | -1.383 | -0.044 | -0.411 | 0.738 | -0.897 | -1.014 | -0.109 | -0.159 | -0.330 | 1.602 | 1.616 | 0.135 | 0.053 | 0.182 | 0.751 | 0.067 | 0.000 | 0.023 | 0.008 | 0.019 | 0.088 | 0.520 | 0.320 | 0.004 | 0.019 | 0.093 | 0.009 | 0.571 | 0.306 | 0.021 | 1.000 |
| normal | -0.076 | -0.007 | 0.022 | -0.102 | -0.012 | 0.002 | 0.364 | 0.009 | 0.017 | 0.012 | 0.011 | 0.009 | 0.008 | 0.020 | 0.007 | -0.297 | -0.068 | -0.303 | -0.299 | -0.203 | -0.203 | 0.361 | -0.139 | -0.007 | -0.122 | 0.372 | 0.330 | -0.217 | -0.215 | -0.134 | -0.305 | -0.309 | -0.205 | -0.204 | 0.019 | 0.793 | 0.187 | 0.136 | 0.009 | 0.014 | 0.074 | 0.563 | 0.075 | 0.013 | 0.102 | 0.015 | 0.037 | 0.002 | 0.006 | 0.942 | 0.012 | 0.000 |
| probe | 0.606 | -0.060 | -0.060 | -0.102 | -0.105 | 0.021 | -1.041 | -0.026 | -0.042 | -0.031 | -0.026 | -0.010 | -0.020 | -0.049 | -0.107 | 0.186 | -0.275 | -0.210 | -0.227 | 1.136 | 1.135 | -0.396 | 0.894 | 0.596 | -0.164 | -0.966 | -0.663 | 1.441 | 1.353 | 1.174 | -0.223 | -0.214 | 1.061 | 1.156 | 0.359 | 0.496 | 0.145 | 0.002 | 0.004 | 0.003 | 0.005 | 0.002 | 0.566 | 0.411 | 0.005 | 0.002 | 0.239 | 0.197 | 0.013 | 0.518 | 0.033 | 1.000 |
| r2l | 0.104 | 1.915 | -0.047 | -0.102 | 3.072 | -0.023 | 0.948 | -0.027 | -0.042 | -0.031 | -0.026 | -0.031 | -0.020 | -0.049 | 2.581 | -0.494 | -0.387 | -0.330 | -0.317 | -0.343 | -0.345 | 0.450 | -0.269 | -0.385 | -0.762 | -0.974 | 0.215 | -0.351 | 1.409 | 0.434 | -0.304 | -0.323 | -0.346 | -0.346 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.297 | 0.703 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.993 | 0.007 | 1.000 |
Standard deviation heatmap¶
In [23]:
feature_mean_per_label = pd.concat(
[df_train_preprocessed.select_dtypes(include=[np.number]), df_train_preprocessed['label']], axis=1).groupby(['label']).std()
feature_mean_per_label.style.format("{:.3f}").background_gradient(cmap='Blues')
Out[23]:
| Â | duration | src_bytes | dst_bytes | wrong_fragment | hot | num_failed_logins | logged_in | num_compromised | root_shell | su_attempted | num_root | num_file_creations | num_shells | num_access_files | is_guest_login | count | srv_count | serror_rate | srv_serror_rate | rerror_rate | srv_rerror_rate | same_srv_rate | diff_srv_rate | srv_diff_host_rate | dst_host_count | dst_host_srv_count | dst_host_same_srv_rate | dst_host_diff_srv_rate | dst_host_same_src_port_rate | dst_host_srv_diff_host_rate | dst_host_serror_rate | dst_host_srv_serror_rate | dst_host_rerror_rate | dst_host_srv_rerror_rate | protocol_type_icmp | protocol_type_tcp | protocol_type_udp | service_domain_u | service_finger | service_ftp | service_ftp_data | service_http | service_other | service_private | service_smtp | service_telnet | flag_REJ | flag_RSTR | flag_S0 | flag_SF | flag_other | binary_label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| label | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â |
| dos | 0.000 | 0.068 | 0.040 | 2.469 | 0.192 | 0.000 | 0.501 | 0.025 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.293 | 1.902 | 1.656 | 1.667 | 1.079 | 1.064 | 1.290 | 0.310 | 0.234 | 0.501 | 0.701 | 0.838 | 0.424 | 0.872 | 0.066 | 1.671 | 1.696 | 1.174 | 1.079 | 0.386 | 0.432 | 0.251 | 0.000 | 0.150 | 0.088 | 0.136 | 0.283 | 0.500 | 0.466 | 0.062 | 0.136 | 0.291 | 0.095 | 0.495 | 0.461 | 0.145 | 0.000 |
| normal | 0.416 | 0.844 | 1.182 | 0.000 | 0.969 | 0.891 | 0.910 | 1.183 | 1.183 | 1.183 | 1.183 | 1.131 | 1.183 | 1.183 | 1.033 | 0.495 | 0.724 | 0.335 | 0.310 | 0.666 | 0.657 | 0.439 | 0.747 | 0.928 | 0.979 | 0.862 | 0.783 | 0.603 | 0.748 | 0.523 | 0.320 | 0.211 | 0.685 | 0.630 | 0.138 | 0.405 | 0.390 | 0.343 | 0.095 | 0.116 | 0.261 | 0.496 | 0.263 | 0.115 | 0.303 | 0.120 | 0.190 | 0.043 | 0.078 | 0.233 | 0.111 | 0.000 |
| probe | 2.596 | 0.001 | 0.025 | 0.000 | 0.019 | 1.887 | 0.168 | 0.010 | 0.000 | 0.000 | 0.009 | 0.844 | 0.000 | 0.000 | 0.212 | 1.446 | 0.211 | 0.535 | 0.596 | 1.609 | 1.644 | 1.268 | 1.976 | 1.581 | 1.156 | 0.620 | 1.133 | 1.867 | 1.241 | 2.223 | 0.447 | 0.606 | 1.520 | 1.680 | 0.480 | 0.500 | 0.352 | 0.040 | 0.062 | 0.057 | 0.074 | 0.040 | 0.496 | 0.492 | 0.070 | 0.040 | 0.427 | 0.398 | 0.111 | 0.500 | 0.178 | 0.000 |
| r2l | 0.770 | 6.900 | 0.022 | 0.000 | 4.910 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 4.161 | 0.004 | 0.008 | 0.279 | 0.283 | 0.000 | 0.000 | 0.082 | 0.276 | 0.411 | 1.077 | 0.303 | 0.836 | 0.124 | 1.335 | 0.706 | 0.299 | 0.029 | 0.085 | 0.006 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.458 | 0.458 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.083 | 0.083 | 0.000 |
Median heatmap¶
In [24]:
feature_mean_per_label = pd.concat([df_train_preprocessed.select_dtypes(include=[np.number]), df_train_preprocessed['label']], axis=1).groupby(['label']).median()
feature_mean_per_label.style.format("{:.3f}").background_gradient(cmap='Blues')
Out[24]:
| Â | duration | src_bytes | dst_bytes | wrong_fragment | hot | num_failed_logins | logged_in | num_compromised | root_shell | su_attempted | num_root | num_file_creations | num_shells | num_access_files | is_guest_login | count | srv_count | serror_rate | srv_serror_rate | rerror_rate | srv_rerror_rate | same_srv_rate | diff_srv_rate | srv_diff_host_rate | dst_host_count | dst_host_srv_count | dst_host_same_srv_rate | dst_host_diff_srv_rate | dst_host_same_src_port_rate | dst_host_srv_diff_host_rate | dst_host_serror_rate | dst_host_srv_serror_rate | dst_host_rerror_rate | dst_host_srv_rerror_rate | protocol_type_icmp | protocol_type_tcp | protocol_type_udp | service_domain_u | service_finger | service_ftp | service_ftp_data | service_http | service_other | service_private | service_smtp | service_telnet | flag_REJ | flag_RSTR | flag_S0 | flag_SF | flag_other | binary_label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| label | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â | Â |
| dos | -0.133 | -0.060 | -0.061 | -0.102 | -0.106 | -0.023 | -1.055 | -0.027 | -0.042 | -0.031 | -0.026 | -0.031 | -0.020 | -0.049 | -0.112 | 1.097 | -0.233 | 3.013 | 3.032 | -0.343 | -0.345 | -2.202 | 0.005 | -0.439 | 0.909 | -1.219 | -1.447 | -0.131 | -0.574 | -0.334 | 3.052 | 3.099 | -0.364 | -0.347 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 |
| normal | -0.133 | -0.059 | -0.054 | -0.102 | -0.106 | -0.023 | 0.948 | -0.027 | -0.042 | -0.031 | -0.026 | -0.031 | -0.020 | -0.049 | -0.112 | -0.469 | -0.330 | -0.353 | -0.344 | -0.343 | -0.345 | 0.457 | -0.292 | -0.439 | -0.037 | 0.980 | 0.786 | -0.409 | -0.544 | -0.334 | -0.354 | -0.333 | -0.364 | -0.347 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 |
| probe | -0.133 | -0.060 | -0.061 | -0.102 | -0.106 | -0.023 | -1.055 | -0.027 | -0.042 | -0.031 | -0.026 | -0.031 | -0.020 | -0.049 | -0.112 | -0.497 | -0.379 | -0.353 | -0.344 | -0.343 | -0.345 | 0.457 | -0.292 | -0.439 | 0.793 | -1.357 | -1.567 | 0.796 | 2.336 | -0.334 | -0.354 | -0.333 | 0.356 | -0.347 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 |
| r2l | -0.133 | -0.058 | -0.061 | -0.102 | -0.106 | -0.023 | 0.948 | -0.027 | -0.042 | -0.031 | -0.026 | -0.031 | -0.020 | -0.049 | -0.112 | -0.497 | -0.391 | -0.353 | -0.344 | -0.343 | -0.345 | 0.457 | -0.292 | -0.439 | -1.476 | -1.034 | 0.786 | -0.409 | 2.336 | 0.526 | -0.354 | -0.333 | -0.364 | -0.347 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 |
Task 2: Shallow Anomaly Detection - Supervised vs Unsupervised¶
In [25]:
def plot_confusion_matrix(y_binary, y_pred, methodology, data_name):
print(f"\nClassification Report ({methodology} vs Ground Truth):")
print(classification_report(y_binary, y_pred))
# Confusion matrix
cm = confusion_matrix(y_binary, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal','Anomaly'], yticklabels=['Normal','Anomaly'])
plt.title('Confusion Matrix %s with %s'%(data_name,methodology))
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
plt.close()
In [26]:
df_train_normal_only = df_train_preprocessed[df_train_preprocessed["binary_label"]==0].copy()
X_train_normal_only = df_train_normal_only.drop(columns=['label', 'binary_label'])
y_train_normal_only = list(df_train_normal_only['binary_label'])
X_train = df_train_preprocessed.drop(columns=['label', 'binary_label'])
y_original_train = list(df_train_preprocessed['label'])
y_train = list(df_train_preprocessed['binary_label'])
X_val = df_val_preprocessed.drop(columns=['label', 'binary_label'])
y_original_val = list(df_val_preprocessed['label'])
y_val = list(df_val_preprocessed['binary_label'])
X_test = df_test_preprocessed.drop(columns=['label', 'binary_label'])
y_original_test = list(df_test_preprocessed['label'])
y_test = list(df_test_preprocessed['binary_label'])
One-Class SVM with Normal data only¶
Default parameters¶
In [27]:
# Fit One-Class SVM
ocsvm_normal_only = OneClassSVM(kernel='rbf')
ocsvm_normal_only.fit(X_train_normal_only)
# Inference
# Predict returns 1 for inliers, -1 for outliers
# Convert to binary anomaly labels: 1 = anomaly, 0 = normal
y_train_pred = ocsvm_normal_only.predict(X_train)
y_train_pred_binary = (y_train_pred == -1).astype(int)
y_val_pred = ocsvm_normal_only.predict(X_val)
y_val_pred_binary = (y_val_pred == -1).astype(int)
y_test_pred = ocsvm_normal_only.predict(X_test)
y_test_pred_binary = (y_test_pred == -1).astype(int)
plot_confusion_matrix(y_train, y_train_pred_binary, "O-SVM trained on normal data and default params", "Training")
plot_confusion_matrix(y_val, y_val_pred_binary, "O-SVM trained on normal data and default params", "Validation")
plot_confusion_matrix(y_test, y_test_pred_binary, "O-SVM trained on normal data and default params", "Test")
Classification Report (O-SVM trained on normal data and default params vs Ground Truth):
precision recall f1-score support
0 0.99 0.50 0.66 10758
1 0.44 0.99 0.61 4306
accuracy 0.64 15064
macro avg 0.72 0.74 0.64 15064
weighted avg 0.83 0.64 0.65 15064
Classification Report (O-SVM trained on normal data and default params vs Ground Truth):
precision recall f1-score support
0 0.99 0.50 0.67 2690
1 0.44 0.99 0.61 1077
accuracy 0.64 3767
macro avg 0.72 0.75 0.64 3767
weighted avg 0.83 0.64 0.65 3767
Classification Report (O-SVM trained on normal data and default params vs Ground Truth):
precision recall f1-score support
0 0.92 0.08 0.15 2152
1 0.65 1.00 0.79 3674
accuracy 0.66 5826
macro avg 0.78 0.54 0.47 5826
weighted avg 0.75 0.66 0.55 5826
Parameters tuning¶
In [28]:
nu_values = [0.01, 0.05, 0.001, 0.005, 0.0001, 0.0005]
gamma_values = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 'auto', 'scale']
best_f1 = 0
best_pars = None
for pars in ParameterGrid({'nu':nu_values, 'gamma':gamma_values}):
oc = OneClassSVM(kernel='rbf', **pars).fit(X_train_normal_only)
# predict on validation set (contains normals + attacks)
y_val_pred = (oc.predict(X_val) == -1).astype(int)
f1 = f1_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
if f1 > best_f1:
best_f1 = f1
best_pars = pars
print(f"Best params: {best_pars}, best F1: {best_f1:.2f}")
Best params: {'gamma': 0.1, 'nu': 0.001}, best F1: 0.90
In [29]:
# Fit One-Class SVM
ocsvm_normal_only = OneClassSVM(kernel='rbf', gamma=best_pars['gamma'], nu=best_pars['nu'])
ocsvm_normal_only.fit(X_train_normal_only)
# Inference
y_train_pred = ocsvm_normal_only.predict(X_train)
y_train_pred_binary = (y_train_pred == -1).astype(int)
y_val_pred = ocsvm_normal_only.predict(X_val)
y_val_pred_binary = (y_val_pred == -1).astype(int)
y_test_pred = ocsvm_normal_only.predict(X_test)
y_test_pred_binary = (y_test_pred == -1).astype(int)
plot_confusion_matrix(y_train, y_train_pred_binary, "O-SVM trained on normal data", "Training")
plot_confusion_matrix(y_val, y_val_pred_binary, "O-SVM trained on normal data", "Validation")
plot_confusion_matrix(y_test, y_test_pred_binary, "O-SVM trained on normal data", "Test")
Classification Report (O-SVM trained on normal data vs Ground Truth):
precision recall f1-score support
0 0.95 0.99 0.97 10758
1 0.96 0.87 0.91 4306
accuracy 0.95 15064
macro avg 0.96 0.93 0.94 15064
weighted avg 0.95 0.95 0.95 15064
Classification Report (O-SVM trained on normal data vs Ground Truth):
precision recall f1-score support
0 0.95 0.97 0.96 2690
1 0.93 0.88 0.90 1077
accuracy 0.95 3767
macro avg 0.94 0.93 0.93 3767
weighted avg 0.95 0.95 0.95 3767
Classification Report (O-SVM trained on normal data vs Ground Truth):
precision recall f1-score support
0 0.74 0.64 0.68 2152
1 0.80 0.87 0.83 3674
accuracy 0.78 5826
macro avg 0.77 0.75 0.76 5826
weighted avg 0.78 0.78 0.78 5826
One-Class SVM with All data¶
In [30]:
normal_samples = df_train_preprocessed[df_train_preprocessed["binary_label"] == 0].shape[0]
anomalies_samples = df_train_preprocessed[df_train_preprocessed["binary_label"] == 1].shape[0]
anomalous_samples_perc = round((anomalies_samples / df_train_preprocessed.shape[0]),2)
print(f"Percentage of normal samples: {(normal_samples / df_train_preprocessed.shape[0]) * 100:.2f} \nPercentage of anomalies samples: {(anomalies_samples / df_train_preprocessed.shape[0]) * 100:.2f}")
Percentage of normal samples: 71.42 Percentage of anomalies samples: 28.58
In [31]:
gamma_values = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 'auto', 'scale']
best_f1 = 0
best_gamma = None
for gamma in gamma_values:
oc = OneClassSVM(kernel='rbf', nu=anomalous_samples_perc, gamma=gamma).fit(X_train)
# predict on validation set (contains normals + attacks)
y_val_pred = (oc.predict(X_val) == -1).astype(int)
f1 = f1_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
if f1 > best_f1:
best_f1 = f1
best_gamma = gamma
print(f"Best gamma: {best_gamma}, best F1: {best_f1:.2f}")
Best gamma: 1e-05, best F1: 0.78
In [32]:
# Fit One-Class SVM
ocsvm_all_data = OneClassSVM(kernel='rbf', gamma=best_gamma, nu=anomalous_samples_perc)
ocsvm_all_data.fit(X_train)
# Inference
# Predict returns 1 for inliers, -1 for outliers
# Convert to binary anomaly labels: 1 = anomaly, 0 = normal
y_train_pred = ocsvm_all_data.predict(X_train)
y_train_pred_binary = (y_train_pred == -1).astype(int)
y_val_pred = ocsvm_all_data.predict(X_val)
y_val_pred_binary = (y_val_pred == -1).astype(int)
y_test_pred = ocsvm_all_data.predict(X_test)
y_test_pred_binary = (y_test_pred == -1).astype(int)
plot_confusion_matrix(y_train, y_train_pred_binary, "O-SVM trained on entire train set", "Training")
plot_confusion_matrix(y_val, y_val_pred_binary, "O-SVM trained on entire train set", "Validation")
plot_confusion_matrix(y_test, y_test_pred_binary, "O-SVM trained on entire train set", "Test")
Classification Report (O-SVM trained on entire train set vs Ground Truth):
precision recall f1-score support
0 0.91 0.90 0.91 10758
1 0.76 0.78 0.77 4306
accuracy 0.87 15064
macro avg 0.84 0.84 0.84 15064
weighted avg 0.87 0.87 0.87 15064
Classification Report (O-SVM trained on entire train set vs Ground Truth):
precision recall f1-score support
0 0.92 0.90 0.91 2690
1 0.77 0.79 0.78 1077
accuracy 0.87 3767
macro avg 0.84 0.85 0.84 3767
weighted avg 0.87 0.87 0.87 3767
Classification Report (O-SVM trained on entire train set vs Ground Truth):
precision recall f1-score support
0 0.63 0.68 0.66 2152
1 0.80 0.77 0.79 3674
accuracy 0.74 5826
macro avg 0.72 0.72 0.72 5826
weighted avg 0.74 0.74 0.74 5826
One-Class SVM with normal traffic and some anomalies¶
In [33]:
X_train_only_normal = df_train_preprocessed.drop(columns=['label'])[df_train_preprocessed["binary_label"] == 0]
X_train_only_anomalous = df_train_preprocessed.drop(columns=['label'])[df_train_preprocessed["binary_label"] == 1]
y_train_only_normal = list(X_train_only_normal["binary_label"])
y_train_only_anomalous = list(X_train_only_anomalous["binary_label"])
f1_scores_train = []
f1_scores_validation = []
f1_scores_test = []
for anomalyPercentage in [0, 0.1, 0.2, 0.5, 1]:
print(f"\nTrying anomaly percentage: {anomalyPercentage}")
if anomalyPercentage == 0:
X_train_anomalous_portion = pd.DataFrame()
y_train_anomalous_portion = []
elif anomalyPercentage == 1:
X_train_anomalous_portion = X_train_only_anomalous
y_train_anomalous_portion = y_train_only_anomalous
else:
X_train_anomalous_portion, _, y_train_anomalous_portion, _ = train_test_split(X_train_only_anomalous, y_train_only_anomalous, train_size=anomalyPercentage, stratify=X_train_only_anomalous['binary_label'],random_state=seed)
X_train = pd.concat([X_train_only_normal, X_train_anomalous_portion], axis=0).drop(columns=['binary_label'])
y_train = pd.concat([pd.DataFrame(y_train_only_normal),pd.DataFrame(y_train_anomalous_portion)], axis=0).squeeze() # squeeze needed otherwise I cannot do the thing of y_train[y_train==1] because it would not change result set length because will give me NaN for non -matching instead of just not returning them
if anomalyPercentage == 0:
nu = 0.001
gamma = 0.1
else:
nu = y_train[y_train==1].shape[0]/y_train.shape[0]
gamma = 1e-5
ocsvm = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
ocsvm.fit(X_train)
y_train_pred = ocsvm.predict(df_train_preprocessed.drop(columns=['label', 'binary_label']))
y_train_pred_binary = (y_train_pred == -1).astype(int)
X_val = df_val_preprocessed.drop(columns=['label', 'binary_label'])
X_test = df_test_preprocessed.drop(columns=['label', 'binary_label'])
y_val_pred = ocsvm.predict(X_val)
y_val_pred_binary = (y_val_pred == -1).astype(int)
y_test_pred = ocsvm.predict(X_test)
y_test_pred_binary = (y_test_pred == -1).astype(int)
plot_confusion_matrix(df_train_preprocessed['binary_label'], y_train_pred_binary, f"O-SVM trained with normal traffic and {anomalyPercentage*100}% anomalies", "Training")
plot_confusion_matrix(y_val, y_val_pred_binary, "O-SVM", "Validation")
plot_confusion_matrix(y_test, y_test_pred_binary, "O-SVM", "Test")
f1_train = f1_score(df_train_preprocessed['binary_label'], y_train_pred_binary, average='macro')
f1_val = f1_score(y_val, y_val_pred_binary, average='macro')
f1_test = f1_score(y_test, y_test_pred_binary, average='macro')
f1_scores_train.append(f1_train)
f1_scores_validation.append(f1_val)
f1_scores_test.append(f1_test)
# Plot f1 scores
plt.figure()
plt.plot([0, 0.1, 0.2, 0.5, 1], f1_scores_train, marker='o', label='Training F1')
plt.plot([0, 0.1, 0.2, 0.5, 1], f1_scores_validation, marker='s', label='Validation F1')
plt.plot([0, 0.1, 0.2, 0.5, 1], f1_scores_test, marker='^', label='Test F1')
plt.xlabel('Anomaly Percentage in Training Set')
plt.ylabel('F1 Macro Score on Test Set')
plt.title('F1 Macro Score For Each Anomaly Percentage')
plt.grid(True)
plt.legend()
plt.show()
# 1) Build the DataFrame
df_results = pd.DataFrame({
'anomaly_percentage': [0, 0.1, 0.2, 0.5, 1],
'F1 train': f1_scores_train,
'F1 val': f1_scores_validation,
'F1test': f1_scores_test
}).round(2)
# 2) Print as a Markdown table (works nicely in Jupyter/VSCode etc.)
display(df_results)
Trying anomaly percentage: 0
Classification Report (O-SVM trained with normal traffic and 0% anomalies vs Ground Truth):
precision recall f1-score support
0 0.95 0.99 0.97 10758
1 0.96 0.87 0.91 4306
accuracy 0.95 15064
macro avg 0.96 0.93 0.94 15064
weighted avg 0.95 0.95 0.95 15064
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.95 0.97 0.96 2690
1 0.93 0.88 0.90 1077
accuracy 0.95 3767
macro avg 0.94 0.93 0.93 3767
weighted avg 0.95 0.95 0.95 3767
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.74 0.64 0.68 2152
1 0.80 0.87 0.83 3674
accuracy 0.78 5826
macro avg 0.77 0.75 0.76 5826
weighted avg 0.78 0.78 0.78 5826
Trying anomaly percentage: 0.1
Classification Report (O-SVM trained with normal traffic and 10.0% anomalies vs Ground Truth):
precision recall f1-score support
0 0.78 0.97 0.87 10758
1 0.83 0.32 0.46 4306
accuracy 0.79 15064
macro avg 0.81 0.65 0.67 15064
weighted avg 0.80 0.79 0.75 15064
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.78 0.98 0.87 2690
1 0.83 0.31 0.45 1077
accuracy 0.78 3767
macro avg 0.81 0.64 0.66 3767
weighted avg 0.79 0.78 0.75 3767
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.39 0.79 0.52 2152
1 0.68 0.27 0.38 3674
accuracy 0.46 5826
macro avg 0.53 0.53 0.45 5826
weighted avg 0.57 0.46 0.43 5826
Trying anomaly percentage: 0.2
Classification Report (O-SVM trained with normal traffic and 20.0% anomalies vs Ground Truth):
precision recall f1-score support
0 0.82 0.96 0.88 10758
1 0.82 0.47 0.60 4306
accuracy 0.82 15064
macro avg 0.82 0.71 0.74 15064
weighted avg 0.82 0.82 0.80 15064
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.82 0.96 0.89 2690
1 0.84 0.47 0.60 1077
accuracy 0.82 3767
macro avg 0.83 0.72 0.74 3767
weighted avg 0.82 0.82 0.80 3767
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.41 0.75 0.53 2152
1 0.71 0.36 0.48 3674
accuracy 0.50 5826
macro avg 0.56 0.56 0.50 5826
weighted avg 0.60 0.50 0.50 5826
Trying anomaly percentage: 0.5
Classification Report (O-SVM trained with normal traffic and 50.0% anomalies vs Ground Truth):
precision recall f1-score support
0 0.87 0.93 0.90 10758
1 0.79 0.66 0.72 4306
accuracy 0.85 15064
macro avg 0.83 0.79 0.81 15064
weighted avg 0.85 0.85 0.85 15064
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.87 0.93 0.90 2690
1 0.80 0.66 0.72 1077
accuracy 0.86 3767
macro avg 0.84 0.80 0.81 3767
weighted avg 0.85 0.86 0.85 3767
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.49 0.70 0.58 2152
1 0.77 0.57 0.66 3674
accuracy 0.62 5826
macro avg 0.63 0.64 0.62 5826
weighted avg 0.67 0.62 0.63 5826
Trying anomaly percentage: 1
Classification Report (O-SVM trained with normal traffic and 100% anomalies vs Ground Truth):
precision recall f1-score support
0 0.91 0.91 0.91 10758
1 0.76 0.76 0.76 4306
accuracy 0.87 15064
macro avg 0.83 0.83 0.83 15064
weighted avg 0.87 0.87 0.87 15064
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.91 0.91 0.91 2690
1 0.77 0.78 0.77 1077
accuracy 0.87 3767
macro avg 0.84 0.84 0.84 3767
weighted avg 0.87 0.87 0.87 3767
Classification Report (O-SVM vs Ground Truth):
precision recall f1-score support
0 0.63 0.68 0.65 2152
1 0.80 0.76 0.78 3674
accuracy 0.73 5826
macro avg 0.71 0.72 0.72 5826
weighted avg 0.74 0.73 0.73 5826
| anomaly_percentage | F1 train | F1 val | F1test | |
|---|---|---|---|---|
| 0 | 0.0 | 0.94 | 0.93 | 0.76 |
| 1 | 0.1 | 0.67 | 0.66 | 0.45 |
| 2 | 0.2 | 0.74 | 0.74 | 0.50 |
| 3 | 0.5 | 0.81 | 0.81 | 0.62 |
| 4 | 1.0 | 0.83 | 0.84 | 0.72 |
Task 3: Deep Anomaly Detection and Data Representation¶
In [947]:
# Set seeds
seed = 69
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"The device is set to: {device}")
The device is set to: cpu
Training and Validating Autoencoder with Normal data only¶
In [948]:
# Define Autoencoder Architecture
class Autoencoder(nn.Module):
def __init__(self, input_dim, bottleneck_dim=16):
super(Autoencoder, self).__init__()
# Encoder: deeper network, batchnorm, dropout
self.encoder = nn.Sequential(
nn.Linear(input_dim, 128),
nn.BatchNorm1d(128),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(128, 64),
nn.BatchNorm1d(64),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(64, bottleneck_dim)
)
# Decoder: symmetric structure
self.decoder = nn.Sequential(
nn.Linear(bottleneck_dim, 64),
nn.BatchNorm1d(64),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(64, 128),
nn.BatchNorm1d(128),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(128, input_dim)
)
def forward(self, x):
z = self.encoder(x) # bottleneck representation
x_hat = self.decoder(z) # reconstruction
return x_hat
def encode(self, x):
return self.encoder(x) # return bottleneck representation
def mixed_autoencoder_loss(x, x_recon, categorical_dims, numerical_idx):
mse = F.mse_loss(x_recon[:, numerical_idx], x[:, numerical_idx])
ce = 0
for start, end in categorical_dims:
x_cat = x[:, start:end]
x_recon_cat = x_recon[:, start:end]
target = torch.argmax(x_cat, dim=1)
ce += F.cross_entropy(x_recon_cat, target)
return mse + ce
In [949]:
# Filter Normal Data and Split into Training and Validation Sets
# Filter normal data (binary_label == 0) from the training set
X_train_normal = df_train_preprocessed[df_train_preprocessed['binary_label'] == 0].drop(columns=['label', 'binary_label'])
X_val_normal = df_val_preprocessed[df_val_preprocessed['binary_label'] == 0].drop(columns=['label', 'binary_label'])
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_normal.values, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val_normal.values, dtype=torch.float32).to(device)
# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor)
val_dataset = TensorDataset(X_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
# Initialize model
input_dim = X_train_normal.shape[1]
model = Autoencoder(input_dim=input_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001) # Starting with a reasonable learning rate
# Define categorical and numerical feature indices
all_columns = df_train_preprocessed.drop(columns=['label', 'binary_label']).columns
categorical_dims = []
for cat in ['protocol_type', 'service', 'flag']:
cat_cols = [i for i, col in enumerate(all_columns) if col.startswith(cat)]
if cat_cols:
categorical_dims.append((min(cat_cols), max(cat_cols) + 1))
numerical_idx = [i for i, col in enumerate(all_columns) if not any(col.startswith(cat) for cat in ['protocol_type', 'service', 'flag'])]
# Train and Validate the Autoencoder with Early Stopping
num_epochs = 100
patience = 10 # Number of epochs to wait for improvement
min_delta = 0.001
best_val_loss = float('inf')
patience_counter = 0
best_model_state = None
best_model_epoch = 0
train_losses = []
val_losses = []
epochs_no_improvement = 0
for epoch in tqdm(range(num_epochs), desc="Training"):
# Training phase
model.train()
train_loss = 0
for batch in train_loader:
x = batch[0]
optimizer.zero_grad()
x_hat = model(x)
loss = mixed_autoencoder_loss(x, x_hat, categorical_dims, numerical_idx)
loss.backward()
optimizer.step()
train_loss += loss.item()
train_loss /= len(train_loader)
train_losses.append(train_loss)
# Validation phase
model.eval()
val_loss = 0
with torch.no_grad():
for batch in val_loader:
x = batch[0]
x_hat = model(x)
loss = mixed_autoencoder_loss(x, x_hat, categorical_dims, numerical_idx)
val_loss += loss.item()
val_loss /= len(val_loader)
val_losses.append(val_loss)
# Early stopping
if best_val_loss - val_loss > min_delta:
best_val_loss = val_loss
epochs_no_improvement = 0
torch.save(model.state_dict(), 'best_model.pth') # Saving the best model
else:
epochs_no_improvement += 1
if epochs_no_improvement >= patience:
print(f'Early stopping triggered after {epoch + 1} epochs!')
print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')
break # Stop training
if (epoch + 1) % 20 == 0:
print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')
# Restore best model weights
model.load_state_dict(torch.load('best_model.pth', map_location=device))
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
# Force integer ticks on x‑axis
ax = plt.gca()
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.show()
Training: 0%| | 0/100 [00:00<?, ?it/s]
Training: 20%|██ | 20/100 [00:10<00:38, 2.09it/s]
Epoch 20/100, Train Loss: 0.4045, Val Loss: 0.5774
Training: 40%|████ | 40/100 [00:19<00:28, 2.07it/s]
Epoch 40/100, Train Loss: 0.2811, Val Loss: 0.2928
Training: 53%|█████▎ | 53/100 [00:26<00:23, 2.00it/s]
Early stopping triggered after 54 epochs! Epoch 54/100, Train Loss: 0.2404, Val Loss: 0.2459
Estimate the Reconstruction Error Threshold¶
In [950]:
X_val_tensor = torch.tensor(X_val_normal.values, dtype=torch.float32).to(device)
model.eval()
with torch.no_grad():
X_val_recon = model(X_val_tensor)
reconstruction_errors = torch.zeros(X_val_tensor.shape[0], device=device)
mse = torch.mean((X_val_tensor[:, numerical_idx] - X_val_recon[:, numerical_idx]) ** 2, dim=1)
for start, end in categorical_dims:
x_cat = X_val_tensor[:, start:end]
x_recon_cat = X_val_recon[:, start:end]
target = torch.argmax(x_cat, dim=1)
ce = F.cross_entropy(x_recon_cat, target, reduction='none')
reconstruction_errors += ce
reconstruction_errors += mse
reconstruction_errors = reconstruction_errors.cpu().numpy()
# Plot ECDF of reconstruction errors
sorted_errors = np.sort(reconstruction_errors)
ecdf = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)
plt.figure(figsize=(10, 5))
plt.plot(sorted_errors, ecdf, label='ECDF')
plt.xlim(0, np.percentile(sorted_errors, 99)) # Limit x-axis to 99th percentile for better focus
plt.xlabel('Reconstruction Error')
plt.ylabel('ECDF')
plt.title('ECDF of Reconstruction Errors on Validation Set (ONLY normal)')
plt.legend()
plt.grid(True)
plt.show()
threshold_val_normal_only = np.percentile(reconstruction_errors, 95)
print(f"Since we trained on normal data, and since we are estimating using a NORMAL data validation set, we want a threshold that is almost the maximum reconstruction error, for example, a possible threshold can be the 95th percentile of NORMAL validation set: {threshold_val_normal_only}")
Since we trained on normal data, and since we are estimating using a NORMAL data validation set, we want a threshold that is almost the maximum reconstruction error, for example, a possible threshold can be the 95th percentile of NORMAL validation set: 0.4012842655181874
Optional evaluation¶
In [951]:
# Use FULL validation set (normal + anomalies)
X_val_tensor = torch.tensor(df_val_preprocessed.drop(columns=['label', 'binary_label']).values, dtype=torch.float32).to(device)
y_val = df_val_preprocessed['binary_label'].values
model.eval()
with torch.no_grad():
X_val_recon = model(X_val_tensor)
reconstruction_errors = torch.zeros(X_val_tensor.shape[0], device=device)
mse = torch.mean((X_val_tensor[:, numerical_idx] - X_val_recon[:, numerical_idx]) ** 2, dim=1)
for start, end in categorical_dims:
x_cat = X_val_tensor[:, start:end]
x_recon_cat = X_val_recon[:, start:end]
target = torch.argmax(x_cat, dim=1)
ce = F.cross_entropy(x_recon_cat, target, reduction='none')
reconstruction_errors += ce
reconstruction_errors += mse
reconstruction_errors = reconstruction_errors.cpu().numpy()
sorted_errors = np.sort(reconstruction_errors)
ecdf = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)
plt.figure(figsize=(10, 5))
plt.plot(sorted_errors, ecdf, label='ECDF')
plt.xlim(0, np.percentile(sorted_errors, 99)) # Limit x-axis to 99th percentile for better focus
plt.xlabel('Reconstruction Error')
plt.ylabel('ECDF')
plt.title('ECDF of Reconstruction Errors on ENTIRE Validation Set')
plt.legend()
plt.grid(True)
plt.show()
# Test various percentile thresholds
percentiles = [50, 60, 70, 80, 90, 95]
avg_precision_recall_scores = []
avg_f1_scores = []
thresholds = []
for percentile in percentiles:
threshold = np.percentile(reconstruction_errors, percentile)
y_pred = (reconstruction_errors > threshold).astype(int)
# Calculate metrics
precision_normal = precision_score(y_val, y_pred, pos_label=0)
recall_anomaly = recall_score(y_val, y_pred, pos_label=1)
avg_precision_recall = (precision_normal + recall_anomaly) / 2
f1_normal = f1_score(y_val, y_pred, pos_label=0)
f1_anomaly = f1_score(y_val, y_pred, pos_label=1)
avg_f1 = (f1_normal + f1_anomaly) / 2
avg_precision_recall_scores.append(avg_precision_recall)
avg_f1_scores.append(avg_f1)
thresholds.append(threshold)
print(f"\nPercentile: {percentile}th (Threshold = {threshold:.4f}) on entire validation set (normal + anomalies)")
print(f"Average of Precision (Normal) and Recall (Anomaly): {avg_precision_recall:.4f}")
print(f"Average of F1 (Normal) and F1 (Anomaly): {avg_f1:.4f}")
# Plot avg_precision_recall and avg_f1 vs percentiles in a single plot
plt.figure(figsize=(10, 5))
plt.plot(percentiles, avg_precision_recall_scores, marker='o', label='Avg Precision (Normal) + Recall (Anomaly)')
plt.plot(percentiles, avg_f1_scores, marker='s', label='Avg F1 (Normal + Anomaly)')
plt.xlabel('Percentile_Entire_Val_Set (Reconstruction Error)')
plt.ylabel('Score')
plt.title('Average Precision-Recall and F1 Scores vs Percentile Thresholds (Entire Validation Set)')
plt.xticks(percentiles, [f"{p}th\n({t:.4f})" for p, t in zip(percentiles, thresholds)]) # Show percentile and threshold
plt.legend()
plt.grid(True)
plt.show()
# Select the 70th percentile as the final threshold
threshold = np.percentile(reconstruction_errors, 70)
print(f"\nSelected Reconstruction Error Threshold (70th percentile of ENTIRE validation set): {threshold:.4f}")
print(f"- We found that a high enough threshold (to avoid too many false alarms) with a low false benign rate is the 70th percentile.")
print(f"- Since we trained on normal data only, we don't need an overly aggressive threshold. A moderate threshold like the 70th percentile of the entire validation set (normal + anomalies) minimizes false normal traffic predictions by focusing on high precision for normal and strong recall for anomalies. Since we want to avoid false benign traffic, we can look at the Average of Precision (Normal) and Recall (Anomaly), which for 70th percentile is: {avg_precision_recall_scores[percentiles.index(70)]:.4f}")
print(f"- But we need to consider how often the model wrongly classifies as anomaly to avoid too many false alarms, so we calculate the average of the previous score with the F1-avg score, which for 70th percentile is: {(avg_precision_recall_scores[percentiles.index(70)] + avg_f1_scores[percentiles.index(70)]) / 2:.4f}")
print(f"\n- Hence, our previous estimation of 95th percentile of NORMAL validation set ({threshold_val_normal_only:.4f}), was a plausible threshold.")
Percentile: 50th (Threshold = 0.0638) on entire validation set (normal + anomalies) Average of Precision (Normal) and Recall (Anomaly): 0.9912 Average of F1 (Normal) and F1 (Anomaly): 0.7691 Percentile: 60th (Threshold = 0.1324) on entire validation set (normal + anomalies) Average of Precision (Normal) and Recall (Anomaly): 0.9712 Average of F1 (Normal) and F1 (Anomaly): 0.8486 Percentile: 70th (Threshold = 0.3090) on entire validation set (normal + anomalies) Average of Precision (Normal) and Recall (Anomaly): 0.9189 Average of F1 (Normal) and F1 (Anomaly): 0.9036 Percentile: 80th (Threshold = 0.8656) on entire validation set (normal + anomalies) Average of Precision (Normal) and Recall (Anomaly): 0.7738 Average of F1 (Normal) and F1 (Anomaly): 0.8575 Percentile: 90th (Threshold = 2.6810) on entire validation set (normal + anomalies) Average of Precision (Normal) and Recall (Anomaly): 0.5644 Average of F1 (Normal) and F1 (Anomaly): 0.6915 Percentile: 95th (Threshold = 3.3889) on entire validation set (normal + anomalies) Average of Precision (Normal) and Recall (Anomaly): 0.4582 Average of F1 (Normal) and F1 (Anomaly): 0.5699
Selected Reconstruction Error Threshold (70th percentile of ENTIRE validation set): 0.3090 - We found that a high enough threshold (to avoid too many false alarms) with a low false benign rate is the 70th percentile. - Since we trained on normal data only, we don't need an overly aggressive threshold. A moderate threshold like the 70th percentile of the entire validation set (normal + anomalies) minimizes false normal traffic predictions by focusing on high precision for normal and strong recall for anomalies. Since we want to avoid false benign traffic, we can look at the Average of Precision (Normal) and Recall (Anomaly), which for 70th percentile is: 0.9189 - But we need to consider how often the model wrongly classifies as anomaly to avoid too many false alarms, so we calculate the average of the previous score with the F1-avg score, which for 70th percentile is: 0.9112 - Hence, our previous estimation of 95th percentile of NORMAL validation set (0.4013), was a plausible threshold.
Anomaly Detection with reconstruction error¶
Testing on validation set (normal only)¶
In [952]:
#Test the Model with ONLY normal validation set
X_val_tensor = torch.tensor(X_val_normal.values, dtype=torch.float32).to(device)
model.eval()
with torch.no_grad():
X_val_recon = model(X_val_tensor)
reconstruction_errors = torch.zeros(X_val_tensor.shape[0], device=device)
mse = torch.mean((X_val_tensor[:, numerical_idx] - X_val_recon[:, numerical_idx]) ** 2, dim=1)
for start, end in categorical_dims:
x_cat = X_val_tensor[:, start:end]
x_recon_cat = X_val_recon[:, start:end]
target = torch.argmax(x_cat, dim=1)
ce = F.cross_entropy(x_recon_cat, target, reduction='none')
reconstruction_errors += ce
reconstruction_errors += mse
reconstruction_errors = reconstruction_errors.cpu().numpy()
y_pred_val = (reconstruction_errors > threshold).astype(int)
# Plot ECDF of reconstruction errors
# Plot ECDF of reconstruction errors with a moderate number of x-axis ticks
sorted_errors = np.sort(reconstruction_errors)
ecdf = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)
plt.figure(figsize=(10, 5))
plt.plot(sorted_errors, ecdf)
plt.xlim(0, np.percentile(sorted_errors, 99)) # Limit x-axis to 99th percentile for better focus
plt.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold = {threshold:.4f}') # Add threshold line
plt.xlabel('Reconstruction Error')
plt.ylabel('ECDF')
plt.title('ECDF of Reconstruction Errors on Validation Set (Only Normal)')
plt.legend()
plt.grid(True)
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(integer=False, nbins=24)) # Moderate number of ticks
plt.show()
print(f"Mean value of reconstruction errors on (only normal) validation set: {np.mean(reconstruction_errors):.4f}")
Mean value of reconstruction errors on (only normal) validation set: 0.2081
Testing on the full train set¶
In [953]:
#Test the Model with ENTIRE train
X_train_tensor = torch.tensor(df_train_preprocessed.drop(columns=['label', 'binary_label']).values, dtype=torch.float32).to(device)
model.eval()
with torch.no_grad():
X_train_recon = model(X_train_tensor)
reconstruction_errors = torch.zeros(X_train_tensor.shape[0], device=device)
mse = torch.mean((X_train_tensor[:, numerical_idx] - X_train_recon[:, numerical_idx]) ** 2, dim=1)
for start, end in categorical_dims:
x_cat = X_train_tensor[:, start:end]
x_recon_cat = X_train_recon[:, start:end]
target = torch.argmax(x_cat, dim=1)
ce = F.cross_entropy(x_recon_cat, target, reduction='none')
reconstruction_errors += ce
reconstruction_errors += mse
reconstruction_errors = reconstruction_errors.cpu().numpy()
y_pred_train = (reconstruction_errors > threshold).astype(int)
# Plot ECDF of reconstruction errors with a moderate number of x-axis ticks
sorted_errors = np.sort(reconstruction_errors)
ecdf = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)
plt.figure(figsize=(10, 5))
plt.plot(sorted_errors, ecdf)
plt.xlim(0, np.percentile(sorted_errors, 99)) # Limit x-axis to 99th percentile for better focus
plt.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold = {threshold:.4f}') # Add threshold line
plt.xlabel('Reconstruction Error')
plt.ylabel('ECDF')
plt.title('ECDF of Reconstruction Errors on ENTIRE Train Set')
plt.legend()
plt.grid(True)
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(integer=False, nbins=24)) # Moderate number of ticks
plt.show()
print(f"Mean value of reconstruction errors on ENTIRE train set: {np.mean(reconstruction_errors):.4f}")
Mean value of reconstruction errors on ENTIRE train set: 0.7250
Testing on the test set¶
In [954]:
#Test the Model with ENTIRE test set
X_test_tensor = torch.tensor(df_test_preprocessed.drop(columns=['label', 'binary_label']).values, dtype=torch.float32).to(device)
model.eval()
with torch.no_grad():
X_test_recon = model(X_test_tensor)
reconstruction_errors = torch.zeros(X_test_tensor.shape[0], device=device)
mse = torch.mean((X_test_tensor[:, numerical_idx] - X_test_recon[:, numerical_idx]) ** 2, dim=1)
for start, end in categorical_dims:
x_cat = X_test_tensor[:, start:end]
x_recon_cat = X_test_recon[:, start:end]
target = torch.argmax(x_cat, dim=1)
ce = F.cross_entropy(x_recon_cat, target, reduction='none')
reconstruction_errors += ce
reconstruction_errors += mse
reconstruction_errors = reconstruction_errors.cpu().numpy()
y_pred_test = (reconstruction_errors > threshold).astype(int)
# Calculate metrics for test set
precision_normal_test = precision_score(y_test, y_pred_test, pos_label=0)
recall_anomaly_test = recall_score(y_test, y_pred_test, pos_label=1)
avg_precision_recall_test = (precision_normal_test + recall_anomaly_test) / 2
f1_normal_test = f1_score(y_test, y_pred_test, pos_label=0)
f1_anomaly_test = f1_score(y_test, y_pred_test, pos_label=1)
avg_f1_test = (f1_normal_test + f1_anomaly_test) / 2
# Plot ECDF of reconstruction errors with a moderate number of x-axis ticks
sorted_errors = np.sort(reconstruction_errors)
ecdf = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)
plt.figure(figsize=(10, 5))
plt.plot(sorted_errors, ecdf)
plt.xlim(0, np.percentile(sorted_errors, 99)) # Limit x-axis to 99th percentile for better focus
plt.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold = {threshold:.4f}') # Add threshold line
plt.xlabel('Reconstruction Error')
plt.ylabel('ECDF')
plt.title('ECDF of Reconstruction Errors on ENTIRE Test Set')
plt.legend()
plt.grid(True)
plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(integer=False, nbins=24)) # Moderate number of ticks
plt.show()
print(f"Mean value of reconstruction errors on ENTIRE test set: {np.mean(reconstruction_errors):.4f}")
print(f"\nThe validation set has smaller reconstruction errors because right now we have a set that contains only normal traffic.\n"
f"Train set contains both normal and anomalous traffic so it has higher reconstruction errors.\n"
f"Test set not only contains both normal and anomalous traffic but it must have a different kind of traffic (domain shift).")
Mean value of reconstruction errors on ENTIRE test set: 1.9379 The validation set has smaller reconstruction errors because right now we have a set that contains only normal traffic. Train set contains both normal and anomalous traffic so it has higher reconstruction errors. Test set not only contains both normal and anomalous traffic but it must have a different kind of traffic (domain shift).
Confusion matrix full test set¶
In [955]:
#plot_confusion_matrix(y_train, y_pred_train, "Encoder + ReconstructionErrorThresh", "Train (entire)")
#print("----------------------------------------------------------------------------------")
print(f"Average of Precision (Normal) and Recall (Anomaly): {avg_precision_recall_test:.4f}")
print(f"Average of F1 (Normal) and F1 (Anomaly): {avg_f1_test:.4f}")
plot_confusion_matrix(y_test, y_pred_test, "Encoder + ReconstructionErrorThresh", "Test (entire)")
Average of Precision (Normal) and Recall (Anomaly): 0.7658
Average of F1 (Normal) and F1 (Anomaly): 0.7270
Classification Report (Encoder + ReconstructionErrorThresh vs Ground Truth):
precision recall f1-score support
0 0.69 0.60 0.64 2152
1 0.78 0.84 0.81 3674
accuracy 0.75 5826
macro avg 0.74 0.72 0.73 5826
weighted avg 0.75 0.75 0.75 5826
Auto-Encoder’s bottleneck and OC-SVM¶
In [956]:
# Define function to extract bottleneck embeddings
def get_bottleneck_embeddings(model, data_tensor):
model.eval()
with torch.no_grad():
embeddings = model.encoder(data_tensor).cpu().numpy()
return embeddings
# Extract bottleneck embeddings for normal training data
X_train_normal_tensor = torch.tensor(df_train_preprocessed[df_train_preprocessed['binary_label'] == 0].drop(columns=['label', 'binary_label']).values, dtype=torch.float32).to(device)
X_train_embeddings = get_bottleneck_embeddings(model, X_train_normal_tensor)
# Train OC-SVM on bottleneck embeddings of normal data
ocsvm = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.001) # same param as task 2 for normal only
ocsvm.fit(X_train_embeddings)
# Extract bottleneck embeddings for test set
X_test_tensor = torch.tensor(df_test_preprocessed.drop(columns=['label', 'binary_label']).values, dtype=torch.float32).to(device)
X_test_embeddings = get_bottleneck_embeddings(model, X_test_tensor)
y_test = df_test_preprocessed['binary_label'].values
# Predict using OC-SVM on test set
y_test_pred = ocsvm.predict(X_test_embeddings)
y_test_pred_binary = (y_test_pred == -1).astype(int)
# Calculate metrics for test set
precision_normal_test = precision_score(y_test, y_test_pred_binary, pos_label=0)
recall_anomaly_test = recall_score(y_test, y_test_pred_binary, pos_label=1)
avg_precision_recall_test = (precision_normal_test + recall_anomaly_test) / 2
f1_normal_test = f1_score(y_test, y_test_pred_binary, pos_label=0)
f1_anomaly_test = f1_score(y_test, y_test_pred_binary, pos_label=1)
avg_f1_test = (f1_normal_test + f1_anomaly_test) / 2
# Print classification report and metrics for test set
plot_confusion_matrix(y_test, y_test_pred_binary, "OC-SVM on Bottleneck Embeddings vs Ground Truth", "Test")
print(f"Average of Precision (Normal) and Recall (Anomaly): {avg_precision_recall_test:.4f}")
print(f"Average of F1 (Normal) and F1 (Anomaly): {avg_f1_test:.4f}")
Classification Report (OC-SVM on Bottleneck Embeddings vs Ground Truth vs Ground Truth):
precision recall f1-score support
0 0.67 0.67 0.67 2152
1 0.81 0.80 0.80 3674
accuracy 0.75 5826
macro avg 0.74 0.74 0.74 5826
weighted avg 0.75 0.75 0.75 5826
Average of Precision (Normal) and Recall (Anomaly): 0.7346 Average of F1 (Normal) and F1 (Anomaly): 0.7363
PCA and OC-SVM¶
PCA Analysis to Find Elbow Point¶
In [957]:
# Extract normal training data
X_train_normal = df_train_preprocessed[df_train_preprocessed['binary_label'] == 0].drop(columns=['label', 'binary_label'])
# PCA Analysis to Find Elbow Point
# Fit PCA on normal training data
pca = PCA()
pca.fit(X_train_normal)
# Calculate cumulative explained variance:
# pca.explained_variance_ratio_[i] is the fraction of total variance
# captured by the i‑th principal component (components are ordered by descending variance).
# np.cumsum(...) then gives, for k=1,2,…, the cumulative variance explained by the first k components.
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
# Plot explained variance to find elbow point
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance vs Number of PCA Components')
plt.grid(True)
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
plt.legend()
plt.show()
# Find elbow point (number of components explaining ~95% variance)
n_components = np.argmax(explained_variance_ratio >= 0.95) + 1
print(f"Selected number of PCA components (explaining ~95% variance): {n_components}")
Selected number of PCA components (explaining ~95% variance): 21
Fit and Transform on normal train set and trasform the test set¶
In [958]:
# Fit and Transform Normal Training Data, Transform Test Data
# Fit PCA with selected number of components
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_normal)
# Transform test set
X_test = df_test_preprocessed.drop(columns=['label', 'binary_label'])
X_test_pca = pca.transform(X_test)
y_test = df_test_preprocessed['binary_label'].values
Train OC-SVM¶
In [959]:
# Train OC-SVM on PCA Components of Normal Data
ocsvm = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.001) # using values used in task 2 for normal only
ocsvm.fit(X_train_pca)
# Evaluate OC-SVM on Test Set
y_test_pred = ocsvm.predict(X_test_pca)
y_test_pred_binary = (y_test_pred == -1).astype(int) # Convert -1 (anomaly) to 1, 1 (normal) to 0
# Calculate metrics
precision_normal = precision_score(y_test, y_test_pred_binary, pos_label=0)
recall_anomaly = recall_score(y_test, y_test_pred_binary, pos_label=1)
avg_precision_recall = (precision_normal + recall_anomaly) / 2
f1_normal = f1_score(y_test, y_test_pred_binary, pos_label=0)
f1_anomaly = f1_score(y_test, y_test_pred_binary, pos_label=1)
avg_f1 = (f1_normal + f1_anomaly) / 2
# Print classification report and metrics
print("\nClassification Report (OC-SVM on PCA Components vs Ground Truth) on Test Set:")
print(classification_report(y_test, y_test_pred_binary, target_names=['Normal', 'Anomaly']))
print(f"Average of Precision (Normal) and Recall (Anomaly): {avg_precision_recall:.4f}")
print(f"Average of F1 (Normal) and F1 (Anomaly): {avg_f1:.4f}")
# Plot confusion matrix
cm = confusion_matrix(y_test, y_test_pred_binary)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
plt.title('Confusion Matrix - OC-SVM on PCA Components (Test Set)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Classification Report (OC-SVM on PCA Components vs Ground Truth) on Test Set:
precision recall f1-score support
Normal 0.66 0.77 0.71 2152
Anomaly 0.85 0.77 0.81 3674
accuracy 0.77 5826
macro avg 0.76 0.77 0.76 5826
weighted avg 0.78 0.77 0.77 5826
Average of Precision (Normal) and Recall (Anomaly): 0.7176
Average of F1 (Normal) and F1 (Anomaly): 0.7609
Task 4: Unsupervised Anomaly Detection and Interpretation¶
In [34]:
# Set seeds
seed = 69
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
In [35]:
def compute_sse(X, centroids, labels):
unique_labels = np.unique(labels)
sse = 0.0
for label in unique_labels:
if label == -1:
continue # Skip noise points (e.g., in DBSCAN)
cluster_points = X[labels == label]
centroid = centroids[label]#cluster_points.mean(axis=0)
sse += np.sum((cluster_points.values - centroid) ** 2)
return sse
def compute_silhouette_details(X, labels):
# Remove noise points if any (e.g., DBSCAN)
mask = labels != -1
X_clean = X[mask]
labels_clean = labels[mask]
# Compute overall silhouette score
silhouette_avg = silhouette_score(X_clean, labels_clean)
# Compute per-sample silhouette scores
sample_silhouette_values = silhouette_samples(X_clean, labels_clean)
# Aggregate by cluster
cluster_silhouettes = {}
for cluster in np.unique(labels_clean):
cluster_silhouettes[cluster] = sample_silhouette_values[labels_clean == cluster].mean()
return silhouette_avg, cluster_silhouettes, sample_silhouette_values
def plot_silhouette(silhouette_avg,sample_silhouette_values,cluster_labels):
distinct_labels = set(cluster_labels)
n_clusters = len(distinct_labels)
if(-1 in distinct_labels):
n_clusters -=1
print("For n_clusters =",n_clusters,"The average silhouette_score is :",silhouette_avg)
fig, ax = plt.subplots(figsize=(8, 6))
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
print("cluster",i,"size",len(ith_cluster_silhouette_values))
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = plt.cm.nipy_spectral(float(i) / n_clusters)
ax.fill_betweenx(
np.arange(y_lower, y_upper),
0,
ith_cluster_silhouette_values,
facecolor=color,
edgecolor=color,
alpha=0.7,
)
# Label the silhouette plots with their cluster numbers at the middle
ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax.set_title("The silhouette plot for the various clusters.")
ax.set_xlabel("The silhouette coefficient values")
ax.set_ylabel("Cluster label")
# The vertical line for average silhouette score of all the values
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
ax.set_yticks([]) # Clear the yaxis labels / ticks
ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
plt.close()
return
def plot_tsne(df, perplexity, lbls):
tsne = TSNE(n_components=2, random_state=seed, perplexity=perplexity)
X_tsne = tsne.fit_transform(df.drop(columns=['label','binary_label','cluster_label']))
labels = list(df[lbls])
df_tsne = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
df_tsne['label'] = labels
plt.figure(figsize=(8, 6))
sns.scatterplot(
data=df_tsne, x='TSNE1', y='TSNE2',
hue='label', palette='Set1', alpha=0.8
)
plt.title('2D t-SNE Projection')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.legend(title='Label')
plt.show()
return
K-means with little domain knowledge¶
In [36]:
X_train = df_train_preprocessed.drop(columns=['label', 'binary_label'])
y_original_train = list(df_train_preprocessed['label'])
y_train = list(df_train_preprocessed['binary_label'])
# non so che dataset usare :)
#df_preprocessed = pd.concat([df_train_preprocessed, df_val_preprocessed], axis=0, ignore_index=True)
#X_train = df_preprocessed.drop(columns=['label', 'binary_label'])
#y_original_train = list(df_preprocessed['label'])
#y_train = list(df_preprocessed['binary_label'])
# Number of times the k-means algorithm is run with different centroid seeds. The final results is the best output of n_init consecutive runs in terms of inertia
kmeans = KMeans(n_clusters=4, n_init=50)
kmeans_labels = kmeans.fit_predict(X_train)
kmeans_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_
sse_avg = compute_sse(X_train,kmeans_centers,kmeans_labels)
silhouette_avg, clusters_silhouettes, sample_silhouette_values = compute_silhouette_details(X_train, kmeans_labels)
K-means cluster interpretation¶
In [37]:
print(f"K-Means SSE: {sse_avg:.2f}")
print(f"K-Means Silhouette Score: {silhouette_avg:.2f}")
plot_silhouette(silhouette_avg,sample_silhouette_values,cluster_labels)
sil_df = pd.Series(clusters_silhouettes, name="Silhouette Score").round(4).rename_axis("Cluster").to_frame()
display(sil_df)
label_distribution = pd.crosstab(kmeans_labels, y_original_train, normalize="index").round(2)
display(label_distribution)
label_distribution_number = pd.crosstab(kmeans_labels, y_original_train).round(2)
display(label_distribution_number)
K-Means SSE: 338638.16 K-Means Silhouette Score: 0.46 For n_clusters = 4 The average silhouette_score is : 0.4631150785092281 cluster 0 size 12110 cluster 1 size 1522 cluster 2 size 7 cluster 3 size 1425
| Silhouette Score | |
|---|---|
| Cluster | |
| 0 | 0.4543 |
| 1 | 0.2666 |
| 2 | 0.4766 |
| 3 | 0.7482 |
| col_0 | dos | normal | probe | r2l |
|---|---|---|---|---|
| row_0 | ||||
| 0 | 0.06 | 0.85 | 0.08 | 0.01 |
| 1 | 0.17 | 0.29 | 0.54 | 0.00 |
| 2 | 0.00 | 1.00 | 0.00 | 0.00 |
| 3 | 0.93 | 0.02 | 0.05 | 0.00 |
| col_0 | dos | normal | probe | r2l |
|---|---|---|---|---|
| row_0 | ||||
| 0 | 736 | 10287 | 943 | 144 |
| 1 | 264 | 435 | 823 | 0 |
| 2 | 0 | 7 | 0 | 0 |
| 3 | 1330 | 29 | 65 | 1 |
Use the t-SNE algorithm to obtain a 2D visualization of your points¶
In [38]:
df_clusters = pd.DataFrame(X_train.copy())
df_clusters["cluster_label"] = kmeans_labels
df_clusters["label"] = y_original_train
df_clusters["binary_label"] = y_train
for perplexity in [10, 25, 40]:
print(f"Perplexity: {perplexity}")
plot_tsne(df_clusters, perplexity, 'cluster_label')
Perplexity: 10
Perplexity: 25
Perplexity: 40
In [39]:
print(f"Perplexity 25")
plot_tsne(df_clusters, 25, 'label')
Perplexity 25
DB-Scan anomalies are anomalies?¶
Estimate min points¶
In [40]:
# The smallest cluster that consists only of normal data is the cluster 3 which contains 7 normal points
min_points = 7 # estimate
Estimate epsilon¶
In [41]:
# K-distance elbow plot per eps
num_neighbors = 4
nn = NearestNeighbors(n_neighbors=num_neighbors, algorithm='auto', n_jobs=-1)
nn.fit(X_train)
# Compute distances to n nearest neighbors (including the point itself)
distances, _ = nn.kneighbors(X_train)
# Distance to the 4th nearest neighbor (not counting self, which is at index 0)
nth_neighbor_distances = distances[:, num_neighbors-1] # n th closest = n-1 th real neighbor
# Now fourth_neighbor_distances[i] gives you the distance of point i to its 4th neighbor
# Plot explained variance ratio
plt.figure(figsize=(8, 5))
plt.plot(sorted(nth_neighbor_distances)) ## Sorted in ascending order
plt.xlabel('Core Points')
plt.ylabel('Distance to its 4th neirest neighbour')
plt.ylim(0,0.5)
plt.grid(True)
plt.show()
plt.close()
# epsilon is estimated using the elbow rule, e = 0.15
Create clustering results¶
In [42]:
# Use DBSCAN with the epsilon and min_points previosly defined
dbscan = DBSCAN(eps=0.15, min_samples=10)
dbscan_labels = dbscan.fit_predict(X_train) # -1 is noise cluster - 0,1,2 are the other clusters
In [43]:
# dbscan_labels == -1 is True for every point DBSCAN classified as noise and False for every point in a cluster.
table = pd.crosstab(dbscan_labels == -1, y_original_train, normalize="index")
print(f"Noise cluster:\n{table}")
# about 70% of points flagged as noise are normal so dbscan has poor performance in this case
Noise cluster: col_0 dos normal probe r2l row_0 False 0.165857 0.738831 0.084810 0.010503 True 0.146459 0.696028 0.148532 0.008981
In [44]:
df_clusters = pd.DataFrame({
'cluster_label': dbscan_labels,
'label': y_original_train,
'binary_label': y_train
})
df_clusters = df_clusters[df_clusters['cluster_label'] != -1] # Filters -1 cluster
top10 = df_clusters['cluster_label'].value_counts().nlargest(10).index # Find the 10 largest clusters
df_top10 = df_clusters[df_clusters['cluster_label'].isin(top10)] # Keep only top 10 clusters
dist_table = pd.crosstab(df_top10['cluster_label'], df_top10['label'], normalize="index").round(3)
print("Label distribution per DBSCAN cluster:")
print(dist_table)
Label distribution per DBSCAN cluster: label dos normal probe cluster_label 0 0.0 0.0 1.0 1 0.0 1.0 0.0 3 0.0 1.0 0.0 4 0.0 1.0 0.0 6 1.0 0.0 0.0 8 0.0 1.0 0.0 9 1.0 0.0 0.0 11 1.0 0.0 0.0 20 0.0 1.0 0.0 28 1.0 0.0 0.0
Use the t-SNE for visualization¶
In [ ]:
df_vis = X_train.copy()
df_vis['cluster_label'] = dbscan_labels
df_vis['label'] = y_original_train
df_vis['binary_label'] = y_train
df_vis = df_vis[df_vis['cluster_label'].isin(top10)].copy() # Keep only top 10 clusters
def plot_tsne(df, perplexity, lbls): # updated function because otherwise we would have same color sometimes
tsne = TSNE(n_components=2, random_state=seed, perplexity=perplexity)
X_tsne = tsne.fit_transform(df.drop(columns=['label', 'binary_label', 'cluster_label']))
labels = list(df[lbls])
df_tsne = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
df_tsne['label'] = labels
unique_labels = np.unique(labels)
n_colors = len(unique_labels)
if n_colors <= 10:
palette = [
'#FF0000', '#00FF00', '#0000FF', '#FF00FF', '#00FFFF',
'#FFA500', '#800080', '#008000', '#FFC0CB', '#FFFF00'
][:n_colors]
else:
cmap = plt.cm.get_cmap('hsv', n_colors)
palette = [cmap(i) for i in range(n_colors)]
plt.figure(figsize=(8, 6))
sns.scatterplot(
data=df_tsne,
x='TSNE1',
y='TSNE2',
hue='label',
palette=palette,
alpha=0.8,
edgecolor='white',
linewidth=0.5
)
plt.title('2D t-SNE Projection')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.legend(title='Label', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
return
# Plot using cluster labels
plot_tsne(df_vis, perplexity=25, lbls='cluster_label')
# Plot using attack labels
plot_tsne(df_vis, perplexity=25, lbls='label')